{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes - Trabalho\n",
    "\n",
    "## Questão 1\n",
    "\n",
    "Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:\n",
    "\n",
    "** Attributos **\n",
    "1. buying: vhigh, high, med, low\n",
    "2. maint: vhigh, high, med, low\n",
    "3. doors: 2, 3, 4, 5, more\n",
    "4. persons: 2, 4, more\n",
    "5. lug_boot: small, med, big\n",
    "6. safety: low, med, high\n",
    "\n",
    "** Classes **\n",
    "1. unacc, acc, good, vgood\n",
    "\n",
    "## Questão 2\n",
    "Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) \n",
    "\n",
    "## Questão 3\n",
    "\n",
    "Analise a acurácia dos dois algoritmos e discuta a sua solução."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn import model_selection as ms\n",
    "import math\n",
    "import random\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Questão 01 (Solução)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 346,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NaiveBayes:\n",
    "    \n",
    "    def __init__(self):\n",
    "        self._summaries = None\n",
    "        self._totalSize = 0\n",
    "        self._classValues = []\n",
    "    \n",
    "    def splitDataset(self, dataset, splitRatio):\n",
    "        trainSize = int(len(dataset) * splitRatio)\n",
    "        trainSet = []\n",
    "        copy = list(dataset)\n",
    "        while len(trainSet) < trainSize:\n",
    "            index = random.randrange(len(copy))\n",
    "            trainSet.append(copy.pop(index))\n",
    "        \n",
    "        return [trainSet, copy]\n",
    "    \n",
    "    def separateByClass(self, dataset):\n",
    "        separated = {}\n",
    "        for i in range(len(dataset)):\n",
    "            vector = dataset[i]\n",
    "            if (vector[-1] not in separated):\n",
    "                separated[vector[-1]] = []\n",
    "                self._classValues.append(vector[-1])\n",
    "            separated[vector[-1]].append(vector)\n",
    "        return separated\n",
    "    \n",
    "\n",
    "    def calculateProbability(self, x, attribute):\n",
    "        cnt = Counter(attribute)\n",
    "        return cnt[x]/len(attribute.index)\n",
    "\n",
    "    def calculateClassProbabilities(self, inputVector):\n",
    "        probabilities = {}\n",
    "        for classValue in self._classValues:\n",
    "            probabilities[classValue] = 1\n",
    "            attributes = pd.DataFrame(self._summaries[classValue])\n",
    "            for i in range(len(inputVector)):\n",
    "                x = inputVector[i]\n",
    "                probabilities[classValue] *= self.calculateProbability(x, attributes.iloc[: , i])\n",
    "        return probabilities\n",
    "    \n",
    "    def fit(self, dataset):\n",
    "        self._totalSize = len(dataset)\n",
    "        self._summaries = self.separateByClass(dataset)\n",
    "    \n",
    "    def predict(self, inputVector):\n",
    "        probabilities = self.calculateClassProbabilities(inputVector)\n",
    "        bestLabel, bestProb = None, -1\n",
    "        for classValue, probability in probabilities.items():\n",
    "            if bestLabel is None or probability > bestProb:\n",
    "                bestProb = probability\n",
    "                bestLabel = classValue\n",
    "        return bestLabel\n",
    "\n",
    "    def getPredictions(self, testSet):\n",
    "        predictions = []\n",
    "        for i in range(len(testSet)):\n",
    "            result = self.predict(testSet[i])\n",
    "            predictions.append(result)\n",
    "        return predictions\n",
    "    \n",
    "    def getAccuracy(self, testSet, predictions):\n",
    "        correct = 0\n",
    "        for i in range(len(testSet)):\n",
    "            (b, m, d, p, l, s, _class) = testSet[i]\n",
    "            if _class == predictions[i]:\n",
    "                correct += 1\n",
    "        return (correct/float(len(testSet))) * 100.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 347,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    " \n",
    "def loadCsv(filename):\n",
    "    lines = csv.reader(open(filename, \"r\"))\n",
    "    dataset = list(lines)\n",
    "    return dataset\n",
    "\n",
    "df = loadCsv('carData.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Handmade NaiveBayes accuracy: 100.0\n"
     ]
    }
   ],
   "source": [
    "NB = NaiveBayes()\n",
    "trainSet, testSet = NB.splitDataset(df, 0.8)\n",
    "NB.fit(trainSet)\n",
    "predictions = NB.getPredictions(testSet)\n",
    "print(\"Handmade NaiveBayes accuracy: {0}\".format(NB.getAccuracy(testSet, predictions)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def splitXY(dataset):\n",
    "    X = []\n",
    "    Y = []\n",
    "    for i in range(len(dataset)):\n",
    "        vector = dataset[i]\n",
    "        Y.append(vector[-1])\n",
    "        vector.remove(vector[-1])\n",
    "        X.append(vector)\n",
    "    return X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lucas/miniconda3/envs/py36/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Found input variables with inconsistent numbers of samples: [1382, 346]",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-360-891f5d535020>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msplitXY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_test_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mNB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Sklearn Gaussian NaiveBayes accuracy: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda3/envs/py36/lib/python3.6/site-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    181\u001b[0m             \u001b[0mReturns\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    182\u001b[0m         \"\"\"\n\u001b[0;32m--> 183\u001b[0;31m         \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    184\u001b[0m         return self._partial_fit(X, y, np.unique(y), _refit=True,\n\u001b[1;32m    185\u001b[0m                                  sample_weight=sample_weight)\n",
      "\u001b[0;32m~/miniconda3/envs/py36/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m    581\u001b[0m         \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    582\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 583\u001b[0;31m     \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    584\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    585\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/miniconda3/envs/py36/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m    202\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    203\u001b[0m         raise ValueError(\"Found input variables with inconsistent numbers of\"\n\u001b[0;32m--> 204\u001b[0;31m                          \" samples: %r\" % [int(l) for l in lengths])\n\u001b[0m\u001b[1;32m    205\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    206\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [1382, 346]"
     ]
    }
   ],
   "source": [
    "NB = GaussianNB()\n",
    "X, Y = splitXY(df)\n",
    "X_train, y_train, X_test, y_test = ms.train_test_split(X, Y, test_size=0.2)\n",
    "NB.fit(X_train, y_train)\n",
    "print(\"Sklearn Gaussian NaiveBayes accuracy: {0}\".format(NB.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
